###########################################################
# Kerice Doten-Snitker
# 
# 
# Built under R 3.4.3 
# Platform: x86_64-apple-darwin15.6.0 (64-bit)
###########################################################

# # set locale to preempt UTF-8 issues with the data
# Sys.getlocale()
# Sys.setlocale(category = "LC_ALL", locale = "en_US.UTF-8")
# 
# #for windows
# #Sys.setlocale(category = "LC_ALL", locale = "English_United States.1252")
# 
# # add packages with the pacman library, which installs if not installed
# library(pacman)
# # use the here package for improving replicability
# p_load(here)
# # for data manipulation
# p_load(magrittr, plyr, tidyverse, reshape2)
# 
# set.seed(1418)

###
# Observations are settlements
# with certain Jewish presence, that is, Juden!=88,
# AND no nichtstaedtische settlements.
###

# add the annualized dominion data to the R environment
mydata_d1 <- read.csv(
  here::here("in_data", "Haverkamp_Ortskatalog_limited_dominion.csv"),
  header = TRUE, encoding = "latin1", stringsAsFactors = FALSE
)

#length(unique(mydata_d1$PlaceID))
#length(mydata_d1$PlaceID[!is.na(mydata_d1$Dom1Year)])

# strip the rows with no year data (Dom1Year is NA)
mydata_d2 <- mydata_d1 %>%
  filter(!is.na(Dom1Year))

# Were any settlements lost when omitting empty rows? 3 that had 99's.
# The comparison has to use the unfiltered input, mydata_d1: the final data
# set mydata_d is only created in step 5, so referring to it here fails in a
# fresh session.
setdiff(unique(mydata_d1$PlaceID), unique(mydata_d2$PlaceID))

write.csv(
  mydata_d2,
  here::here("Diffusion", "out_files", "Dominion_periods.csv"),
  row.names = FALSE
)

###########################################################
# Task: Change from year ranges to having all years for each settlement ####
###########################################################

# 1: Change from wide data to long ----------------------------------------

# Target layout: [PlaceName, PlaceID, DomYear, DomParty, DomTrans]
# Reshape recipe taken from:
# https://stackoverflow.com/questions/12466493/reshaping-multiple-sets-of-measurement-columns-wide-format-into-single-columns
# (to_long in the sjmisc package would be an alternative)
mydata_d3 <- reshape(
  mydata_d2,
  direction = "long",
  idvar = c("PlaceID", "PlaceName"),
  varying = list(
    Year = seq(3, 27, by = 3),
    Party = seq(4, 28, by = 3),
    Trans = seq(5, 29, by = 3), # all Dom1Trans=NA
    # The end of a period is the Year column of the following period. The
    # last period has no successor, so it borrows column 5 (all NA); those
    # NAs are replaced with the closing year further down.
    End = c(seq(6, 27, by = 3), 5)
  ),
  v.names = c("DomYear", "DomParty", "DomTrans", "DomEnd")
)
#str(mydata_d3)

# check preservation: number of distinct settlements after reshaping
length(unique(mydata_d3[["PlaceID"]]))

# sort by settlement, then drop the period slots that hold no year
# (empty yr/party/trans)
mydata_d3 <- mydata_d3[order(mydata_d3[["PlaceID"]]), ]
mydata_d4 <- mydata_d3[complete.cases(mydata_d3[["DomYear"]]), ]
#nrow(mydata_d4)

# open-ended periods close in 1521 (829 NA: 1 for each sett.)
mydata_d4$DomEnd <- replace(mydata_d4$DomEnd, is.na(mydata_d4$DomEnd), 1521)

# 2: Assign identifiers to parties ------------------------------------------

# Build the alphabetized list of distinct party names for manual recoding.
#
# Every "." (with or without a following space) is replaced, not only the
# first one per name, which is the same replace-all treatment DomParty2
# receives in step 3. Replacing only the first occurrence would leave names
# containing several dots spelled differently from the keys looked up later.
#
# Each entry is split on its own instead of first collapsing all entries into
# a single string, so that one missing DomParty value cannot turn the whole
# collapsed string (and hence the whole party list) into NA.
parties <- mydata_d4$DomParty %>%
  str_split(", ") %>% # split each entry's party list on ,\s
  unlist() %>% # flatten to one character vector of party names
  str_replace_all("\\.\\s", "_") %>% # replace .\s in party names with _;
  str_replace_all("\\.", "_") %>% # . in party names with _;
  str_replace_all(" ", "_") %>% # and remaining \s with _
  unique() %>% # remove any duplicates
  sort() # alphabetize, for easier manual recoding (also drops NA)

write.csv(
  parties,
  here::here("Diffusion", "out_files", "parties.csv"),
  row.names = FALSE, fileEncoding = "latin1"
)

# manual recoding: 1) same party, 2) party type

# 3: Apply party codes to dominion data ------------------------

# read the manually coded party table (included in files)
parties_coded <- read.csv(
  here::here("Diffusion", "out_files", "parties_coded.csv"),
  header = TRUE,
  encoding = "latin1",
  stringsAsFactors = FALSE
)

# DomParty2 is a list-column copy of DomParty whose names are rewritten to
# match the easier-to-use versions in the parties file: ".\s" and "." become
# "_", a whitespace character that follows a word boundary becomes "_", and
# the individual names are then pulled out, one character vector per row.
mydata_d4$DomParty2 <- str_extract_all(
  mydata_d4$DomParty %>%
    str_replace_all("\\.\\s", "_") %>%
    str_replace_all("\\.", "_") %>%
    str_replace_all("\\b\\s", "_"),
  "[\\w|-]+\\b"
)

# function for adding data from the parties table to the main data
#
# For every row of `maindata`, each item of the list-column `listcol` is
# matched against `lookupdata[[refcol]]`. The matched lookup rows (minus the
# reference column) are stored, and every numeric lookup column is summed
# over the row's items and added to `maindata` under the same column name.
#
# Arguments:
#   maindata    data frame that contains the list-column
#   listcol     name (string) of the list-column in `maindata`
#   lookupdata  data frame with the reference values and the data to add
#   refcol      name (string) of the column in `lookupdata` to match against
#
# Returns `maindata` with these columns added:
#   stored      list-column; per row, a list with one one-row data frame per
#               item (a row of NAs for an item that has no match)
#   stored2     list-column; per row, one data frame with the rows of `stored`
#               stacked in item order
#   <numeric lookup columns>
#               per-row sum over the items, as double. NA propagates, so an
#               unmatched item makes the sum NA; a row without items sums to 0.
#               An existing column of the same name is overwritten.
#
# Stops with an error if the inputs are not data frames or if `listcol` /
# `refcol` are not column names of the respective data frame.
list.lookup <- function(maindata, listcol, lookupdata, refcol) {
  stopifnot(
    is.data.frame(maindata),
    is.data.frame(lookupdata),
    listcol %in% names(maindata),
    refcol %in% names(lookupdata)
  )

  n_rows <- nrow(maindata)
  items <- maindata[[listcol]]
  ref_values <- lookupdata[[refcol]]
  # the lookup data minus the overlapping column
  info_cols <- setdiff(names(lookupdata), refcol)

  # preallocate the list-columns instead of growing them row by row
  stored <- vector("list", n_rows)
  stored2 <- vector("list", n_rows)

  for (i in seq_len(n_rows)) {
    # identify which lookup rows match this row's items (NA = no match)
    hits <- match(unlist(items[[i]], use.names = FALSE), ref_values)

    # one mini data frame per item; drop = FALSE keeps a data frame even when
    # only a single lookup column remains
    stored[[i]] <- lapply(
      hits,
      function(hit) lookupdata[hit, info_cols, drop = FALSE]
    )

    # all of the row's matches in a single data frame
    combined <- lookupdata[hits, info_cols, drop = FALSE]
    rownames(combined) <- NULL
    stored2[[i]] <- combined
  }

  maindata$stored <- stored
  maindata$stored2 <- stored2

  # sum values in numeric columns to get counts per row of the main data
  for (col in info_cols) {
    if (is.numeric(lookupdata[[col]])) {
      maindata[[col]] <- vapply(
        stored2,
        function(found) sum(found[[col]]),
        numeric(1)
      )
    }
  }

  maindata
} # end function


# attach the coded party information to every dominion period
mydata_d5 <- list.lookup(
  mydata_d4,
  listcol = "DomParty2",
  lookupdata = parties_coded,
  refcol = "x"
)

# 4: Change from one row per dominion tenure to annual observations ---------

# Each period is expanded to one row per year from DomYear to DomEnd - 1; the
# helper columns are dropped before the list-column of years is unnested.
# NOTE(review): seq() counts downward when DomEnd - 1 < DomYear, so a period
# that starts and ends in the same year would yield the years DomYear and
# DomYear - 1 -- confirm that no such period occurs in the data.
mydata_d6 <- mydata_d5 %>%
  mutate(Years = mapply(seq, DomYear, DomEnd - 1, SIMPLIFY = FALSE)) %>%
  dplyr::select(-c(DomEnd, time, DomParty2, stored, stored2)) %>%
  unnest(cols = c(Years))

# code 8 = dominion period with no transition (no prior period)
mydata_d6$DomTrans <- replace(
  mydata_d6$DomTrans,
  is.na(mydata_d6$DomTrans),
  8
)

# 5: Recoding and new variables ---------------------------------------------

mydata_d <- mydata_d6 %>%
  # timing of the dominion period
  mutate(
    DomChg = ifelse(DomYear == Years, 1, 0), # 1 in the year of a transition
    DomTenure = Years - DomYear # n years since the dominion transition
  ) %>%
  # number and kind of ruling parties
  mutate(
    # total n of ruling parties
    AuthTotal = King + Prince + Lord + minor + imperial + free + city +
      RelFund + RelHouse + Archbishop + Bishop,
    # total n of high-rank non-imperial ruling parties
    AuthMajor = King + Prince + Lord + free + Archbishop + Bishop,
    # binary: at least one authority is religious
    AuthRel = ifelse(
      RelFund > 0 | RelHouse > 0 | Archbishop > 0 | Bishop > 0, 1, 0
    ),
    # binary: at least one authority is episcopal
    AuthEpisc = ifelse(Archbishop > 0 | Bishop > 0, 1, 0),
    # binary: at least one authority is nobility
    AuthNob = ifelse(King > 0 | Prince > 0 | Lord > 0 | minor > 0, 1, 0),
    # binary: religious authority and no authority of any other kind
    AuthRelOnly = ifelse(
      AuthRel == 1 &
        (King + Prince + Lord + minor + imperial + free + city == 0),
      1, 0
    ),
    # binary: noble authority and no authority of any other kind
    AuthNobOnly = ifelse(
      AuthNob == 1 &
        (imperial + free + city + RelFund + RelHouse + Archbishop + Bishop
          == 0),
      1, 0
    ),
    # binary: imperial/free/city authority and no authority of any other kind
    AuthFreeImp = ifelse(
      (imperial > 0 | free > 0 | city > 0) &
        (King + Prince + Lord + minor +
          RelFund + RelHouse + Archbishop + Bishop == 0),
      1, 0
    ),
    # binary: more than one ruling party
    AuthShare = ifelse(AuthTotal > 1, 1, 0)
  ) %>%
  # one indicator per DomTrans code 1-7
  mutate(
    DomRts = ifelse(DomTrans == 1, 1, 0),
    DomSld = ifelse(DomTrans == 2, 1, 0),
    DomMtg = ifelse(DomTrans == 3, 1, 0),
    DomFf = ifelse(DomTrans == 4, 1, 0),
    DomCqr = ifelse(DomTrans == 5, 1, 0),
    DomFam = ifelse(DomTrans == 6, 1, 0),
    DomOth = ifelse(DomTrans == 7, 1, 0)
  ) %>%
  # presence indicator per party type, and the number of types present
  mutate(
    KingBin = ifelse(King >= 1, 1, 0),
    PrinceBin = ifelse(Prince >= 1, 1, 0),
    LordBin = ifelse(Lord >= 1, 1, 0),
    minorBin = ifelse(minor >= 1, 1, 0),
    imperialBin = ifelse(imperial >= 1, 1, 0),
    freeBin = ifelse(free >= 1, 1, 0),
    cityBin = ifelse(city >= 1, 1, 0),
    RelFundBin = ifelse(RelFund >= 1, 1, 0),
    RelHouseBin = ifelse(RelHouse >= 1, 1, 0),
    ArchbishopBin = ifelse(Archbishop >= 1, 1, 0),
    BishopBin = ifelse(Bishop >= 1, 1, 0),
    AuthTypes = KingBin + PrinceBin + LordBin + minorBin + imperialBin +
      freeBin + cityBin + RelFundBin + RelHouseBin +
      ArchbishopBin + BishopBin
  )

# NOTE(review): this writes mydata_d6, i.e. the annual data WITHOUT the
# variables created above in mydata_d -- confirm whether mydata_d was meant.
write.csv(
  mydata_d6,
  here::here("Diffusion", "out_files", "dominion_annual.csv"),
  row.names = FALSE
)

